001    /*
002     * ScoringMatrix.java
003     *
004     * Copyright 2003 Sergio Anibal de Carvalho Junior
005     *
006     * This file is part of NeoBio.
007     *
008     * NeoBio is free software; you can redistribute it and/or modify it under the terms of
009     * the GNU General Public License as published by the Free Software Foundation; either
010     * version 2 of the License, or (at your option) any later version.
011     *
012     * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
013     * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
014     * PURPOSE. See the GNU General Public License for more details.
015     *
016     * You should have received a copy of the GNU General Public License along with NeoBio;
017     * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018     * Boston, MA 02111-1307, USA.
019     *
020     * Proper attribution of the author as the source of the software would be appreciated.
021     *
022     * Sergio Anibal de Carvalho Junior             mailto:sergioanibaljr@users.sourceforge.net
023     * Department of Computer Science               http://www.dcs.kcl.ac.uk
024     * King's College London, UK                    http://www.kcl.ac.uk
025     *
026     * Please visit http://neobio.sourceforge.net
027     *
028     * This project was supervised by Professor Maxime Crochemore.
029     *
030     */
031    
032    package neobio.alignment;
033    
034    import java.io.Reader;
035    import java.io.StreamTokenizer;
036    import java.io.IOException;
037    
038    /**
039     * This class implements a scoring scheme based on a substitution matrix. It is useful
040     * to represent PAM and BLOSUM family of amino acids scoring matrices. Its constructor
041     * loads such matrices from a file (or any other character stream). The following is an
042     * extract of a BLOSUM62 scoring matrix file:
043     * <CODE><BLOCKQUOTE><PRE>
044     *       A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X  *
045     *    A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1  0 -4
046     *    R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1  0 -1 -4
047     *    ...
048     *    B -2 -1  3  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4  1 -1 -4
049     *    Z -1  0  0  1 -3  3  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
050     *    X  0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2  0  0 -2 -1 -1 -1 -1 -1 -4
051     *    * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4  1
052     * </PRE></BLOCKQUOTE></CODE>
053     *
054     * <P>Matrices are expected to follow this format. They must have one row an one column
055     * for each defined character (not necessarily in the same order). Each row and column
056     * must start with a distinct character (no repetition) and all row characters must have a
057     * correspondent column, and vice versa.</P>
058     *
059     * <P>Value at position (i,j) represent the score of substituting character of row i for
060     * character of column j. Insertion penalties are specified by the last row while deletion
061     * penalties must be located at the last column (both represented by the special character
062     * defined by the <CODE>INDEL_CHAR</CODE> constant). Note that it only supports an
063     * additive gap cost function. In case any of this rules are not followed, an
064     * {@linkplain InvalidScoringMatrixException} exception is raised by the constructor.</P>
065     *
066     * <P>If a scoring operation (substitution, insertion or deletion) involves a character
067     * not found in the matrix, an exception is raised.</P>
068     *
069     * @author Sergio A. de Carvalho Jr.
070     * @see InvalidScoringMatrixException
071     */
072    public class ScoringMatrix extends ScoringScheme
073    {
074            /**
075             * The character that indicates the row and column for insertion and deletion
076             * penalties in the matrix.
077             */
078            protected static final char INDEL_CHAR = '*';
079    
080            /**
081             * The character used to start a comment line in the scoring matrix file.
082             */
083            protected static final char COMMENT_CHAR = '#';
084    
085            /**
086             * Stores matrix column headers in the order they were found.
087             */
088            protected String col_codes;
089    
090            /**
091             * Stores matrix row headers in the order they were found.
092             */
093            protected String row_codes;
094    
095            /**
096             * Stores values for each operation (substitution, insertion or deletion) defined by
097             * this matrix.
098             */
099            protected int matrix[][];
100    
101            /**
102             * Dimension of the (squared) matrix.
103             */
104            protected int dimension;
105    
106            /**
107             * The maximum absolute score that this matrix can return for any substitution,
108             * deletion or insertion.
109             */
110            protected int max_absolute_score;
111    
112            /**
113             * Creates a new instance of a substitution matrix loaded from the character stream.
114             * The case of characters is significant when subsequently computing their score.
115             *
116             * @param input character stream from where the matrix is read
117             * @throws IOException if an I/O operation fails when reading from input
118             * @throws InvalidScoringMatrixException if the matrix does not comply with the
119             * specification
120             */
121            public ScoringMatrix (Reader input)
122                    throws IOException, InvalidScoringMatrixException
123            {
124                    this (input, true);
125            }
126    
127            /**
128             * Creates a new instance of a substitution matrix loaded from the character stream.
129             * If <CODE>case_sensitive</CODE> is <CODE>true</CODE>, the case of characters is
130             * significant when subsequently computing their score; otherwise the case is
131             * ignored.
132             *
133             * @param input character stream from where the matrix is read
134             * @param case_sensitive <CODE>true</CODE> if the case of characters must be
135             * @throws IOException if an I/O operation fails when reading from input
136             * @throws InvalidScoringMatrixException if the matrix does not comply with the
137             * specification
138             */
139            public ScoringMatrix (Reader input, boolean case_sensitive)
140                    throws IOException, InvalidScoringMatrixException
141            {
142                    super (case_sensitive);
143    
144                    StreamTokenizer in;
145                    StringBuffer    buf = new StringBuffer();
146                    int                     row, col, max_abs = 0;
147                    char                    c;
148    
149                    // create a stream tokenizer on top of the input
150                    // stream and set the COMMENT_CHAR as the comment character
151                    in = new StreamTokenizer(input);
152                    in.commentChar(COMMENT_CHAR);
153    
154                    // consider ends of line when reading the first row
155                    in.eolIsSignificant(true);
156    
157                    // skip blank lines (if any)
158                    for (in.nextToken(); in.ttype == StreamTokenizer.TT_EOL; in.nextToken());
159    
160                    // read first row: column character codes
161                    while ((in.ttype != StreamTokenizer.TT_EOF) &&
162                                    (in.ttype != StreamTokenizer.TT_EOL))
163                    {
164                            if (in.ttype == StreamTokenizer.TT_WORD)
165                            {
166                                    if (in.sval.length() > 1)
167                                            throw new InvalidScoringMatrixException
168                                                    ("Column headers must have one-character only.");
169    
170                                            buf.append(in.sval.charAt(0));
171                            }
172                            else if (in.ttype == INDEL_CHAR)
173                            {
174                                    buf.append(INDEL_CHAR);
175                            }
176                            else
177                            {
178                                    throw new InvalidScoringMatrixException("Column headers must be " +
179                                            "one-character codes or the special character '" + INDEL_CHAR + "'.");
180                            }
181    
182                            in.nextToken();
183                    }
184    
185                    // convert everything to upper case if it's not case sensitive
186                    if (case_sensitive)
187                            col_codes = buf.toString();
188                    else
189                            col_codes = buf.toString().toUpperCase();
190    
191                    dimension = col_codes.length();
192    
193                    // check if there's a column for deletion penalties
194                    if (col_codes.indexOf (INDEL_CHAR) == -1)
195                            throw new InvalidScoringMatrixException
196                                    ("Matrix have no column for deletion penalties.");
197    
198                    // check if there is at least one character code (besides the INDEL char)
199                    if (dimension < 2)
200                            throw new InvalidScoringMatrixException
201                                    ("Matrix must have at least one column with a character code.");
202    
203                    // check for repeated column codes
204                    for (int i = 0; i < dimension; i++)
205                            if (col_codes.indexOf(col_codes.charAt(i),i+1) > i)
206                                    throw new InvalidScoringMatrixException
207                                            ("Columns must have distinct one-character codes.");
208    
209                    // allocate matrix
210                    matrix = new int[dimension][dimension];
211    
212                    // reset buffer
213                    buf.delete (0, dimension);
214    
215                    // from now on, ignore ends of line
216                    in.eolIsSignificant(false);
217                    if (in.ttype == StreamTokenizer.TT_EOL) in.nextToken();
218    
219                    // read rest of matrix (one line for each character, but
220                    // not necessarily in the same order as the columns)
221                    for (row = 0; row < dimension && in.ttype != StreamTokenizer.TT_EOF; row++)
222                    {
223                            // start reading the line: the character code must come first
224                            if (in.ttype == StreamTokenizer.TT_WORD)
225                            {
226                                    if (in.sval.length() > 1)
227                                            throw new InvalidScoringMatrixException
228                                                    ("Codes must have one character only.");
229    
230                                    buf.append(in.sval.charAt(0));
231                            }
232                            else if (in.ttype == INDEL_CHAR)
233                            {
234                                    buf.append(INDEL_CHAR);
235                            }
236                            else
237                            {
238                                    throw new InvalidScoringMatrixException ("Rows must start with an" +
239                                            " one-character code or the special character '" + INDEL_CHAR + "'.");
240                            }
241    
242                            // now, the set of values
243                            for (col = 0; col < dimension; col++)
244                            {
245                                    // start reading the values
246                                    if (in.nextToken() != StreamTokenizer.TT_NUMBER)
247                                            throw new InvalidScoringMatrixException
248                                                    ("Invalid value at row " + (row+1) + ", column " + (col+1) + ".");
249    
250                                    matrix[row][col] = (int) in.nval;
251    
252                                    if (Math.abs(matrix[row][col]) > max_abs)
253                                            max_abs = Math.abs(matrix[row][col]);
254                            }
255    
256                            in.nextToken();
257                    }
258    
259                    // convert everything to upper case if it's not case sensitive
260                    if (case_sensitive)
261                            row_codes = buf.toString();
262                    else
263                            row_codes = buf.toString().toUpperCase();
264    
265                    // check if read as many rows as columns
266                    if (row_codes.length() != dimension)
267                            throw new InvalidScoringMatrixException
268                                    ("Matrix must have as many rows as columns.");
269    
270                    // check if there's a row for insertion penalties
271                    if (row_codes.indexOf(INDEL_CHAR) == -1)
272                            throw new InvalidScoringMatrixException
273                                    ("Matrix have no row for insertion penalties.");
274    
275                    // check for repeated row codes
276                    for (int i = 0; i < dimension; i++)
277                            if (row_codes.indexOf(row_codes.charAt(i),i+1) > i)
278                                    throw new InvalidScoringMatrixException
279                                            ("Rows must have distinct one-character codes.");
280    
281                    // check if all rows have a corresponding column
282                    for (int i = 0; i < dimension; i++)
283                            if (col_codes.indexOf(c = row_codes.charAt(i)) == -1)
284                                    throw new InvalidScoringMatrixException
285                                            ("There is no corresponding column for row character '" + c + "'.");
286    
287                    // store the maximum absolute value found
288                    this.max_absolute_score = max_abs;
289            }
290    
291            /**
292             * Returns the score of a substitution of character <CODE>a</CODE> for character
293             * <CODE>b</CODE> according to this scoring matrix.
294             *
295             * @param a first character
296             * @param b second character
297             * @return score of a substitution of character <CODE>a</CODE> for <CODE>b</CODE>
298             * @throws IncompatibleScoringSchemeException if this substitution is not defined
299             */
300            public int scoreSubstitution (char a, char b)
301                    throws IncompatibleScoringSchemeException
302            {
303                    int r,c;
304    
305                    if (case_sensitive)
306                    {
307                            r = row_codes.indexOf(a);
308                            c = col_codes.indexOf(b);
309                    }
310                    else
311                    {
312                            r = row_codes.indexOf(Character.toUpperCase(a));
313                            c = col_codes.indexOf(Character.toUpperCase(b));
314                    }
315    
316                    if (r < 0 || c < 0)
317                            throw new IncompatibleScoringSchemeException ("Substitution of character " +
318                                                                                                            a + " for " + b + " is not defined.");
319    
320                    return matrix[r][c];
321            }
322    
323            /**
324             * Returns the score of an insertion of character <CODE>a</CODE> according to this
325             * scoring matrix.
326             *
327             * @param a character to be inserted
328             * @return score of insertion of <CODE>a</CODE>
329             * @throws IncompatibleScoringSchemeException if this character is not recognised
330             */
331            public int scoreInsertion (char a) throws IncompatibleScoringSchemeException
332            {
333                    return scoreSubstitution (INDEL_CHAR, a);
334            }
335    
336            /**
337             * Returns the score of a deletion of character <CODE>a</CODE> according to this
338             * scoring matrix.
339             *
340             * @param a character to be deleted
341             * @return score of deletion of <CODE>a</CODE>
342             * @throws IncompatibleScoringSchemeException if this character is not recognised
343             */
344            public int scoreDeletion (char a) throws IncompatibleScoringSchemeException
345            {
346                    return scoreSubstitution (a, INDEL_CHAR);
347            }
348    
349            /**
350             * Tells whether this scoring scheme supports partial matches, which it does, although
351             * a particular scoring matrix loaded by this instace might not. A partial match is
352             * a situation when two characters are not equal but, for any reason, are regarded
353             * as similar by this scoring scheme, which then returns a positive score value. This
354             * is common for amino acid scoring matrices.
355             *
356             * @return always return <CODE>true</CODE>
357             */
358            public boolean isPartialMatchSupported ()
359            {
360                    return true;
361            }
362    
363            /**
364             * Returns the maximum absolute score that this scoring scheme can return for any
365             * substitution, deletion or insertion.
366             *
367             * @return maximum absolute score that can be returned
368             */
369            public int maxAbsoluteScore ()
370            {
371                    return max_absolute_score;
372            }
373    
374            /**
375             * Returns a String representation of this scoring matrix.
376             *
377             * @return a String representation of this scoring matrix
378             */
379            public String toString ()
380            {
381                    int row, col;
382    
383                    StringBuffer buf = new StringBuffer();
384    
385                    // column numbers
386                    buf.append("Scoring matrix:\n\t");
387                    for (col = 0; col < dimension; col++)
388                    {
389                            buf.append("\t" + col);
390                    }
391                    buf.append("\n\t");
392    
393                    // column headers
394                    for (col = 0; col < dimension; col++)
395                    {
396                            buf.append('\t');
397                            buf.append(col_codes.charAt(col));
398                    }
399    
400                    // rest of matrix
401                    for (row = 0; row < dimension; row++)
402                    {
403                            // row number and code
404                            buf.append("\n" + row + "\t" + row_codes.charAt(row));
405    
406                            for (col = 0; col < dimension; col++)
407                            {
408                                    buf.append('\t');
409                                    buf.append(matrix[row][col]);
410                            }
411                    }
412    
413                    return buf.toString();
414            }
415    }